In [93]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
In [102]:
# Load the NYC restaurant pricing dataset (columns include Case, Restaurant,
# Price, Food, Decor, Service, East).
df0 = pd.read_csv("./nyc.csv")
df0.head(1)
Out[102]:
In [ ]:
In [63]:
# 불필요값(Case), String값 제거
dfx = df.drop(["Case", "Price", "Restaurant"], axis=1)
In [64]:
dfx.head(1)
Out[64]:
In [65]:
# Scailing 하기
from sklearn.preprocessing import StandardScaler
In [66]:
scaler = StandardScaler()
scaler.fit(dfx)
dfX = scaler.transform(dfx)
In [67]:
dfX = pd.DataFrame(dfX, columns = ["Food", "Decor", "Service", "East"])
In [68]:
# constant항 추가
dfX = sm.add_constant(dfX)
In [69]:
dfy = df["Price"]
dfy = pd.DataFrame(dfy)
In [89]:
# df 합치기 (Target=Price)
df = pd.concat([dfX, dfy], axis=1)
df.head(1)
Out[89]:
In [92]:
# Build the model with an R-style formula (from_formula interface)
import statsmodels.formula.api as smf
In [91]:
model = smf.ols(formula="Price~Food+Decor+Service+East", data=df)
In [87]:
result = model.fit()
In [88]:
print(result.summary())
In [98]:
# anova table 그리기
table = sm.stats.anova_lm(result)
In [97]:
table
Out[97]:
In [100]:
sns.jointplot("East", "Price", data=df)
Out[100]:
In [200]:
# East's p-value = 0.03: below the 5% significance level, so the East
# coefficient is statistically meaningful (nonzero) at 5% — though not at 1%.
# Additional premium for the East side = +3%
In [ ]:
In [ ]:
In [202]:
# Load the 2004 cars dataset
df20 = pd.read_csv("cars04.csv")
df20.head(1)
Out[202]:
In [203]:
# Keep the candidate predictors plus the target (SuggestedRetailPrice, last column)
df2 = df20[["EngineSize", "Cylinders", "Horsepower", "HighwayMPG", "Weight", "WheelBase", "Hybrid", "SuggestedRetailPrice"]]
In [204]:
df2.head(1)
Out[204]:
In [205]:
# scaling 안한 모델 작성
df1 = sm.add_constant(df2)
df1.head(1)
Out[205]:
In [206]:
model1 = sm.OLS(df1.ix[:,-1], df1.ix[:,:-1])
In [207]:
result1 = model1.fit()
In [208]:
print(result.summary())
In [209]:
# scaling 진행
In [210]:
scaler.fit(df2)
df = scaler.transform(df2)
In [211]:
df = pd.DataFrame(df, columns = ["EngineSize", "Cylinders", "Horsepower", "HighwayMPG", "Weight", "WheelBase", "Hybrid", "SuggestedRetailPrice"])
In [212]:
df.head(1)
Out[212]:
In [213]:
df2 = sm.add_constant(df)
In [214]:
model2 = sm.OLS(df2.ix[:,-1], df2.ix[:,:-1])
In [215]:
result2 = model2.fit()
In [216]:
# Scailing한 모델 : 안한것 보다 R-Squared 값 상승 확인함
print(result2.summary())
In [217]:
# Hybrid, WheelBase 제거 후 모델 다시 만들기
df3 = df2.drop(["Hybrid", "WheelBase"], axis=1)
In [218]:
model3 = sm.OLS(df3.ix[:,-1], df3.ix[:,:-1])
result3 = model3.fit()
In [220]:
print(result3.summary()) # Adj. R-squared 값 0.002 상승함...
In [142]:
# R-squared가 0.8 미만이므로 나머지 data 확인
In [174]:
sns.jointplot("HighwayMPG","SuggestedRetailPrice", data = df3 )
Out[174]:
In [170]:
# Check the ANOVA table for the reduced model
model4 = sm.OLS.from_formula("SuggestedRetailPrice~EngineSize+Cylinders+Horsepower+HighwayMPG+Weight", data=df3)
In [171]:
result4 = model4.fit()
In [172]:
table = sm.stats.anova_lm(result4)
In [198]:
table
Out[198]:
In [244]:
# HighwayMPG needs a transformation... build a polynomial-regression model!
In [264]:
# I(...) applies the squared terms inside the formula (patsy identity function)
model5 = sm.OLS.from_formula("SuggestedRetailPrice~ HighwayMPG + I(HighwayMPG**2)+EngineSize+Cylinders+Horsepower+Weight+I(Weight**2)", data=df3)
In [265]:
result5 = model5.fit()
In [266]:
print(result5.summary())
In [267]:
# Adj. R-squared target reached, but the residuals are not normally
# distributed... the model is overfitted
In [ ]: